import pandas as pd
import altair as alt
from pandas_profiling import ProfileReport
from altair_data_server import data_server
# Save a vega-lite spec and a PNG blob for each plot in the notebook
alt.renderers.enable('mimetype')
# Handle large data sets without embedding them in the notebook
alt.data_transformers.enable('data_server')
DataTransformerRegistry.enable('data_server')
headernames = [
"ID",
"LIMIT_BAL",
"SEX",
"EDUCATION",
"MARRIAGE",
"AGE",
"PAY_0",
"PAY_2",
"PAY_3",
"PAY_4",
"PAY_5",
"PAY_6",
"BILL_AMT1",
"BILL_AMT2",
"BILL_AMT3",
"BILL_AMT4",
"BILL_AMT5",
"BILL_AMT6",
"PAY_AMT1",
"PAY_AMT2",
"PAY_AMT3",
"PAY_AMT4",
"PAY_AMT5",
"PAY_AMT6",
"default payment next month",
]
train_df = pd.read_csv("../data/split/train.csv", index_col=0, names=headernames, skiprows=1, encoding="utf-8")
Looking at the first and last rows of the training data.
train_df.head()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23637 | 23638 | 200000 | 2 | 2 | 2 | 46 | 0 | 0 | 0 | 0 | ... | 136629 | 139504 | 142269 | 4342 | 4593 | 14955 | 5143 | 5166 | 5349 | 0 |
| 17169 | 17170 | 50000 | 2 | 3 | 2 | 26 | 0 | 0 | 0 | 0 | ... | 26999 | 27748 | 28351 | 1700 | 2000 | 3000 | 1500 | 1200 | 1136 | 0 |
| 15955 | 15956 | 210000 | 2 | 1 | 2 | 30 | 0 | 0 | 0 | 0 | ... | 5125 | 5732 | 9297 | 2000 | 2500 | 2000 | 1000 | 4000 | 4000 | 0 |
| 21486 | 21487 | 90000 | 2 | 1 | 2 | 27 | 0 | 0 | 2 | 0 | ... | 50611 | 52764 | 55898 | 4000 | 0 | 3000 | 3000 | 4000 | 2043 | 0 |
| 12211 | 12212 | 60000 | 1 | 2 | 1 | 40 | 0 | 0 | 0 | 0 | ... | 22564 | 24191 | 25778 | 1500 | 1500 | 2000 | 2000 | 2000 | 2000 | 1 |
5 rows × 25 columns
train_df.tail()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4426 | 4427 | 110000 | 2 | 2 | 1 | 28 | 0 | 0 | 0 | 0 | ... | 45579 | 46843 | 48062 | 2027 | 2500 | 2000 | 2000 | 2000 | 1600 | 0 |
| 12695 | 12696 | 20000 | 2 | 3 | 2 | 38 | -1 | -1 | 2 | 0 | ... | 3668 | 3508 | 5278 | 5015 | 0 | 0 | 3508 | 5278 | 780 | 0 |
| 3360 | 3361 | 150000 | 2 | 2 | 1 | 42 | 0 | 0 | 0 | 0 | ... | 138016 | 144084 | 136604 | 6022 | 5022 | 5100 | 10500 | 0 | 10000 | 0 |
| 18283 | 18284 | 190000 | 1 | 1 | 1 | 54 | 0 | 0 | 0 | 0 | ... | 192803 | 161324 | 156576 | 7154 | 7385 | 7520 | 5593 | 6000 | 5516 | 0 |
| 28564 | 28565 | 100000 | 2 | 1 | 1 | 36 | -2 | -2 | -2 | -2 | ... | 15156 | 13920 | 2099 | 3514 | 3400 | 15156 | 13920 | 2099 | 15638 | 0 |
5 rows × 25 columns
There are 24000 records in the training data set and no missing values in any rows or columns.
train_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 24000 entries, 23637 to 28564 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 24000 non-null int64 1 LIMIT_BAL 24000 non-null int64 2 SEX 24000 non-null int64 3 EDUCATION 24000 non-null int64 4 MARRIAGE 24000 non-null int64 5 AGE 24000 non-null int64 6 PAY_0 24000 non-null int64 7 PAY_2 24000 non-null int64 8 PAY_3 24000 non-null int64 9 PAY_4 24000 non-null int64 10 PAY_5 24000 non-null int64 11 PAY_6 24000 non-null int64 12 BILL_AMT1 24000 non-null int64 13 BILL_AMT2 24000 non-null int64 14 BILL_AMT3 24000 non-null int64 15 BILL_AMT4 24000 non-null int64 16 BILL_AMT5 24000 non-null int64 17 BILL_AMT6 24000 non-null int64 18 PAY_AMT1 24000 non-null int64 19 PAY_AMT2 24000 non-null int64 20 PAY_AMT3 24000 non-null int64 21 PAY_AMT4 24000 non-null int64 22 PAY_AMT5 24000 non-null int64 23 PAY_AMT6 24000 non-null int64 24 default payment next month 24000 non-null int64 dtypes: int64(25) memory usage: 4.8 MB
We have a binary feature, a few categorical features, and numerical features.
train_df.describe()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | ... | 24000.000000 | 24000.000000 | 24000.000000 | 24000.000000 | 2.400000e+04 | 24000.000000 | 24000.00000 | 24000.000000 | 24000.000000 | 24000.000000 |
| mean | 15012.940792 | 167338.833333 | 1.602542 | 1.850875 | 1.551208 | 35.516833 | -0.012708 | -0.130250 | -0.163958 | -0.220458 | ... | 43232.172750 | 40292.983542 | 38784.349208 | 5641.261958 | 5.873889e+03 | 5116.207083 | 4742.45800 | 4722.876500 | 5188.472375 | 0.221500 |
| std | 8649.751001 | 129933.404625 | 0.489382 | 0.788139 | 0.521430 | 9.216367 | 1.122875 | 1.196806 | 1.197450 | 1.168901 | ... | 64301.779315 | 60868.199998 | 59615.313087 | 15971.770984 | 2.209639e+04 | 15582.324583 | 15175.92081 | 15032.940888 | 17817.373001 | 0.415265 |
| min | 1.000000 | 10000.000000 | 1.000000 | 0.000000 | 0.000000 | 21.000000 | -2.000000 | -2.000000 | -2.000000 | -2.000000 | ... | -170000.000000 | -81334.000000 | -339603.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 7529.750000 | 50000.000000 | 1.000000 | 1.000000 | 1.000000 | 28.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | 2395.000000 | 1761.750000 | 1261.750000 | 1000.000000 | 8.357500e+02 | 390.750000 | 285.00000 | 264.000000 | 107.750000 | 0.000000 |
| 50% | 15014.500000 | 140000.000000 | 2.000000 | 2.000000 | 2.000000 | 34.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 18991.000000 | 18019.000000 | 16961.000000 | 2100.000000 | 2.012500e+03 | 1818.500000 | 1500.00000 | 1500.000000 | 1500.000000 | 0.000000 |
| 75% | 22476.250000 | 240000.000000 | 2.000000 | 2.000000 | 2.000000 | 41.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 54519.000000 | 50218.250000 | 48990.250000 | 5006.000000 | 5.000000e+03 | 4535.500000 | 4012.25000 | 4026.000000 | 4000.000000 | 0.000000 |
| max | 29999.000000 | 800000.000000 | 2.000000 | 6.000000 | 3.000000 | 79.000000 | 8.000000 | 7.000000 | 8.000000 | 8.000000 | ... | 706864.000000 | 823540.000000 | 699944.000000 | 505000.000000 | 1.684259e+06 | 417588.000000 | 528897.00000 | 426529.000000 | 528666.000000 | 1.000000 |
8 rows × 25 columns
train_df.columns.tolist()
['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default payment next month']
categorical_features = [
"EDUCATION",
"MARRIAGE",
"PAY_0",
"PAY_2",
"PAY_3",
"PAY_4",
"PAY_5",
"PAY_6",
]
binary_features = ["SEX"]
drop = ["ID", "default payment next month"]
numeric_features = [
"LIMIT_BAL",
"AGE",
"BILL_AMT1",
"BILL_AMT2",
"BILL_AMT3",
"BILL_AMT4",
"BILL_AMT5",
"BILL_AMT6",
"PAY_AMT1",
"PAY_AMT2",
"PAY_AMT3",
"PAY_AMT4",
"PAY_AMT5",
"PAY_AMT6",
]
Education: Ordinal feature. 1 = graduate school; 2 = university; 3 = high school; 4 = others.
Marital status: 1 = married; 2 = single; 3 = others.
PAY_X: Ordinal feature. The history of monthly payment tracked from April to September, 2005, as follows: PAY_1 = the repayment status in September, 2005; PAY_2 = the repayment status in August, 2005; . . .;PAY_6 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
Sex is reported as a binary feature (1 = male; 2 = female).
ID duplicate column from the index.
default payment next month is the target column.
LIMIT_BAL: The amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
Age: The age of the individual (years).
BILL_AMTX: Amount of bill statement (NT dollar). BILL_AMT1 = amount of bill statement in September, 2005; BILL_AMT2 = amount of bill statement in August, 2005; . . .; BILL_AMT6 = amount of bill statement in April, 2005.
PAY_AMTX: Amount of previous payment (NT dollar). PAY_AMT1 = amount paid in September, 2005; PAY_AMT2 = amount paid in August, 2005; . . .;PAY_AMT6 = amount paid in April, 2005.
# Plotting categorical features
alt.Chart(train_df).mark_bar(opacity=0.7).encode(
x=alt.X(alt.repeat()),
y=alt.Y("count()", title="Count of records"),
color=alt.Color("default payment next month:N"),
).properties(width=200, height=100).repeat(categorical_features, columns=3)
# Plotting the binary feature.
alt.Chart(train_df).mark_bar(opacity=0.7).encode(
x=alt.X("count()", title="Count of records"),
y=alt.Y("SEX:N", title="SEX"),
color=alt.Color("default payment next month:N"),
)
# Plotting numeric features.
alt.Chart(train_df).mark_bar(opacity=0.7).encode(
x=alt.X(alt.repeat(), bin=alt.Bin(maxbins=30)),
y=alt.Y("count()", title="Count of records"),
color=alt.Color("default payment next month:N")
).properties(width=300, height=300).repeat(numeric_features, columns=3)
# Correlation matrix
train_df.corr('spearman').style.format(precision=2).background_gradient()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | 1.00 | 0.03 | 0.02 | 0.03 | -0.03 | 0.03 | -0.02 | -0.00 | -0.01 | -0.00 | -0.01 | -0.00 | 0.02 | 0.01 | 0.02 | 0.04 | 0.02 | 0.02 | 0.02 | 0.06 | 0.09 | 0.02 | 0.01 | 0.04 | -0.01 |
| LIMIT_BAL | 0.03 | 1.00 | 0.06 | -0.27 | -0.12 | 0.19 | -0.30 | -0.34 | -0.33 | -0.31 | -0.28 | -0.26 | 0.06 | 0.05 | 0.06 | 0.08 | 0.09 | 0.09 | 0.28 | 0.28 | 0.29 | 0.29 | 0.30 | 0.32 | -0.17 |
| SEX | 0.02 | 0.06 | 1.00 | 0.02 | -0.03 | -0.09 | -0.06 | -0.08 | -0.07 | -0.07 | -0.06 | -0.05 | -0.05 | -0.05 | -0.04 | -0.03 | -0.02 | -0.01 | -0.01 | 0.00 | 0.02 | 0.01 | 0.01 | 0.03 | -0.04 |
| EDUCATION | 0.03 | -0.27 | 0.02 | 1.00 | -0.16 | 0.16 | 0.13 | 0.17 | 0.16 | 0.15 | 0.14 | 0.12 | 0.09 | 0.09 | 0.08 | 0.07 | 0.06 | 0.05 | -0.05 | -0.05 | -0.05 | -0.05 | -0.05 | -0.05 | 0.05 |
| MARRIAGE | -0.03 | -0.12 | -0.03 | -0.16 | 1.00 | -0.47 | 0.02 | 0.04 | 0.05 | 0.05 | 0.05 | 0.05 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.01 | -0.00 | -0.02 | -0.01 | -0.02 | -0.01 | -0.02 | -0.02 |
| AGE | 0.03 | 0.19 | -0.09 | 0.16 | -0.47 | 1.00 | -0.07 | -0.09 | -0.09 | -0.08 | -0.09 | -0.08 | 0.00 | 0.00 | 0.00 | -0.00 | -0.00 | -0.00 | 0.04 | 0.05 | 0.03 | 0.04 | 0.04 | 0.04 | 0.00 |
| PAY_0 | -0.02 | -0.30 | -0.06 | 0.13 | 0.02 | -0.07 | 1.00 | 0.63 | 0.55 | 0.52 | 0.48 | 0.46 | 0.31 | 0.33 | 0.31 | 0.30 | 0.30 | 0.29 | -0.10 | -0.07 | -0.06 | -0.04 | -0.03 | -0.05 | 0.29 |
| PAY_2 | -0.00 | -0.34 | -0.08 | 0.17 | 0.04 | -0.09 | 0.63 | 1.00 | 0.80 | 0.71 | 0.67 | 0.63 | 0.57 | 0.55 | 0.52 | 0.49 | 0.48 | 0.46 | 0.02 | 0.08 | 0.08 | 0.09 | 0.10 | 0.08 | 0.21 |
| PAY_3 | -0.01 | -0.33 | -0.07 | 0.16 | 0.05 | -0.09 | 0.55 | 0.80 | 1.00 | 0.80 | 0.72 | 0.67 | 0.52 | 0.59 | 0.55 | 0.53 | 0.51 | 0.48 | 0.21 | 0.03 | 0.10 | 0.12 | 0.12 | 0.10 | 0.19 |
| PAY_4 | -0.00 | -0.31 | -0.07 | 0.15 | 0.05 | -0.08 | 0.52 | 0.71 | 0.80 | 1.00 | 0.82 | 0.73 | 0.51 | 0.56 | 0.62 | 0.59 | 0.56 | 0.53 | 0.18 | 0.24 | 0.07 | 0.14 | 0.16 | 0.14 | 0.17 |
| PAY_5 | -0.01 | -0.28 | -0.06 | 0.14 | 0.05 | -0.09 | 0.48 | 0.67 | 0.72 | 0.82 | 1.00 | 0.82 | 0.50 | 0.53 | 0.58 | 0.65 | 0.62 | 0.58 | 0.17 | 0.22 | 0.26 | 0.11 | 0.19 | 0.17 | 0.16 |
| PAY_6 | -0.00 | -0.26 | -0.05 | 0.12 | 0.05 | -0.08 | 0.46 | 0.63 | 0.67 | 0.73 | 0.82 | 1.00 | 0.48 | 0.52 | 0.56 | 0.60 | 0.67 | 0.63 | 0.17 | 0.20 | 0.23 | 0.28 | 0.14 | 0.20 | 0.14 |
| BILL_AMT1 | 0.02 | 0.06 | -0.05 | 0.09 | 0.01 | 0.00 | 0.31 | 0.57 | 0.52 | 0.51 | 0.50 | 0.48 | 1.00 | 0.91 | 0.86 | 0.80 | 0.77 | 0.73 | 0.50 | 0.47 | 0.44 | 0.44 | 0.43 | 0.41 | -0.03 |
| BILL_AMT2 | 0.01 | 0.05 | -0.05 | 0.09 | 0.01 | 0.00 | 0.33 | 0.55 | 0.59 | 0.56 | 0.53 | 0.52 | 0.91 | 1.00 | 0.91 | 0.85 | 0.80 | 0.76 | 0.63 | 0.50 | 0.47 | 0.46 | 0.45 | 0.43 | -0.02 |
| BILL_AMT3 | 0.02 | 0.06 | -0.04 | 0.08 | 0.00 | 0.00 | 0.31 | 0.52 | 0.55 | 0.62 | 0.58 | 0.56 | 0.86 | 0.91 | 1.00 | 0.90 | 0.85 | 0.81 | 0.55 | 0.64 | 0.49 | 0.49 | 0.48 | 0.46 | -0.01 |
| BILL_AMT4 | 0.04 | 0.08 | -0.03 | 0.07 | 0.01 | -0.00 | 0.30 | 0.49 | 0.53 | 0.59 | 0.65 | 0.60 | 0.80 | 0.85 | 0.90 | 1.00 | 0.90 | 0.85 | 0.51 | 0.55 | 0.63 | 0.51 | 0.50 | 0.48 | -0.01 |
| BILL_AMT5 | 0.02 | 0.09 | -0.02 | 0.06 | 0.01 | -0.00 | 0.30 | 0.48 | 0.51 | 0.56 | 0.62 | 0.67 | 0.77 | 0.80 | 0.85 | 0.90 | 1.00 | 0.90 | 0.48 | 0.51 | 0.55 | 0.65 | 0.52 | 0.51 | -0.01 |
| BILL_AMT6 | 0.02 | 0.09 | -0.01 | 0.05 | 0.01 | -0.00 | 0.29 | 0.46 | 0.48 | 0.53 | 0.58 | 0.63 | 0.73 | 0.76 | 0.81 | 0.85 | 0.90 | 1.00 | 0.45 | 0.48 | 0.52 | 0.57 | 0.67 | 0.53 | -0.00 |
| PAY_AMT1 | 0.02 | 0.28 | -0.01 | -0.05 | -0.00 | 0.04 | -0.10 | 0.02 | 0.21 | 0.18 | 0.17 | 0.17 | 0.50 | 0.63 | 0.55 | 0.51 | 0.48 | 0.45 | 1.00 | 0.51 | 0.52 | 0.49 | 0.47 | 0.46 | -0.17 |
| PAY_AMT2 | 0.06 | 0.28 | 0.00 | -0.05 | -0.02 | 0.05 | -0.07 | 0.08 | 0.03 | 0.24 | 0.22 | 0.20 | 0.47 | 0.50 | 0.64 | 0.55 | 0.51 | 0.48 | 0.51 | 1.00 | 0.52 | 0.52 | 0.50 | 0.49 | -0.15 |
| PAY_AMT3 | 0.09 | 0.29 | 0.02 | -0.05 | -0.01 | 0.03 | -0.06 | 0.08 | 0.10 | 0.07 | 0.26 | 0.23 | 0.44 | 0.47 | 0.49 | 0.63 | 0.55 | 0.52 | 0.52 | 0.52 | 1.00 | 0.52 | 0.54 | 0.51 | -0.15 |
| PAY_AMT4 | 0.02 | 0.29 | 0.01 | -0.05 | -0.02 | 0.04 | -0.04 | 0.09 | 0.12 | 0.14 | 0.11 | 0.28 | 0.44 | 0.46 | 0.49 | 0.51 | 0.65 | 0.57 | 0.49 | 0.52 | 0.52 | 1.00 | 0.54 | 0.55 | -0.13 |
| PAY_AMT5 | 0.01 | 0.30 | 0.01 | -0.05 | -0.01 | 0.04 | -0.03 | 0.10 | 0.12 | 0.16 | 0.19 | 0.14 | 0.43 | 0.45 | 0.48 | 0.50 | 0.52 | 0.67 | 0.47 | 0.50 | 0.54 | 0.54 | 1.00 | 0.55 | -0.12 |
| PAY_AMT6 | 0.04 | 0.32 | 0.03 | -0.05 | -0.02 | 0.04 | -0.05 | 0.08 | 0.10 | 0.14 | 0.17 | 0.20 | 0.41 | 0.43 | 0.46 | 0.48 | 0.51 | 0.53 | 0.46 | 0.49 | 0.51 | 0.55 | 0.55 | 1.00 | -0.12 |
| default payment next month | -0.01 | -0.17 | -0.04 | 0.05 | -0.02 | 0.00 | 0.29 | 0.21 | 0.19 | 0.17 | 0.16 | 0.14 | -0.03 | -0.02 | -0.01 | -0.01 | -0.01 | -0.00 | -0.17 | -0.15 | -0.15 | -0.13 | -0.12 | -0.12 | 1.00 |
profile = ProfileReport(train_df, title="Pandas Profiling Report") # , minimal=True)
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]